* ---------------------------------------------------------------------------
* Survey preparation: de-duplicate repeat respondents, build second-order
* ("so_") belief variables, and keep rows with usable hypothetical answers.
* Output: ${ddata}temp.dta
* ---------------------------------------------------------------------------
use "${rdata}survey_shuffledid.dta", clear

*** Repeat respondents / rows with no gender information ***
* Order each userid's rows by sce_date and record how many rows it has and
* which row is the first / second one.
bysort userid (sce_date): gen howmany = _N
by userid: gen first  = (_n == 1)
by userid: gen second = (_n == 2)

* firstobstr is 1 for every row of a userid that has exactly two rows and
* whose first row has treatment==1; for those userids the second row is dropped.
by userid: egen firstobstr = max(first == 1 & treatment == 1 & howmany == 2)
drop if second == 1 & firstobstr == 1

drop if male == .

* Scenario labels; not referenced in the part of the file visible here.
local titles "Mother Earns 15% Less" "Mother Earns Same" "Mother Earns 15% More" ///
	"Mother Earns 30% More" "Mother Earns 50% More"

* Second-order answers come from the "a" items when male==0 and from the "b"
* items when male==1; any other value of male leaves the result missing.
forvalues i = 1/5 {
	gen so_mother_time_off_`i' = cond(male == 0, qgn2newa_`i', cond(male == 1, qgn2newb_`i', .))
}

gen so_mother_reject = cond(male == 0, qgn0j_1, cond(male == 1, qgn0j_2, .))

* Completeness flags: all five scenario answers present (own / second-order).
gen has_all_qs2_fo = !missing(qs2_1, qs2_2, qs2_3, qs2_4, qs2_5)
gen has_all_qs2_so = !missing(so_mother_time_off_1, so_mother_time_off_2, so_mother_time_off_3, so_mother_time_off_4, so_mother_time_off_5)

* Blank out the five scenario answers unless both sets are complete.
forvalues i = 1/5 {
	replace qs2_`i' = . if !has_all_qs2_fo | !has_all_qs2_so
	replace so_mother_time_off_`i' = . if !has_all_qs2_fo | !has_all_qs2_so
}

gen all = 1

* Index 0 holds qs1 (own answer) and so_mother_reject (second-order answer).
gen qs2_0 = qs1
gen so_mother_time_off_0 = so_mother_reject

replace qs2_0 = . if qs2_0 == . | so_mother_time_off_0 == .

* Cell identifier used later when computing group-level means.
egen group = group(state male)

* data_i is 1 when qs2_i is not system-missing, 0 otherwise.
forvalues i = 0/5 {
	gen data_`i' = (qs2_`i' != .)
}

********* keep a row only if it has data for at least one of question 0 / question 1
keep if data_0 != 0 | data_1 != 0

save "${ddata}temp.dta", replace




**************************************************************
* 2. Construct the shares for ACS and survey data respectively
**************************************************************
******* ACS 2021 1% *******

* Build ${ddata}ACS2021_raw.dta only when it does not exist yet; otherwise the
* existing file is reused. NOTE(review): usa_00021 presumably loads the raw
* ACS extract into memory before the save -- confirm against that do-file.
capture confirm file "${ddata}ACS2021_raw.dta"
if (_rc != 0) {
	do "${code}usa_00021"
	save "${ddata}ACS2021_raw.dta", replace
	/* alternative construction, currently disabled:
	do "${code}setup_acs"
	drop if year==2022
	save "${ddata}ACS2021_raw.dta", replace*/
}

* ---------------------------------------------------------------------------
* ACS side: classify household heads into income / education / region / age /
* sex cells, then write
*   (a) ${ddata}avg_faminc_bygroup_ACS.dta : perwt-weighted mean ftotinc by cell
*   (b) ${ddata}share_ACS.dta              : perwt-weighted population share by cell
* ---------------------------------------------------------------------------
use "${ddata}ACS2021_raw.dta", clear
* keep household heads to be consistent with the SCE sample
keep if relate==1

// Income groups (<30K, 30k-50k, 50k-100k, 100k+, unclassified)
/*
. count if relate==1 &  ftotinc==9999999
  0

. count if ftotinc==9999999
  163,874

*/
gen income_grp = .
replace income_grp = 0 if ftotinc==9999999 //unclassified
* !mi() guard: a system-missing value compares greater than any number in
* Stata and would otherwise land in the top bracket.
replace income_grp = 1 if ftotinc>100000  & ftotinc!=9999999 & !mi(ftotinc)
replace income_grp = 2 if ftotinc> 50000  & ftotinc<=100000
replace income_grp = 3 if ftotinc> 30000  & ftotinc<= 50000
replace income_grp = 4 if ftotinc<=30000
label define income_group 0 "unclassified" 1 "100k+" 2 "50k-100k" 3 "30k-50k" 4 "<30k"
label values income_grp income_group


// Education groups (HS or less, Some college, College grad+, unclassified)
* educd==1 is deliberately left missing (the assignment to 0 is disabled).
gen educ_grp = .
//replace educ_grp = 0 if educd==1
replace educ_grp = 1 if educd< 65  & educd!=1
replace educ_grp = 2 if educd>=65  & educd<101
replace educ_grp = 3 if educd>=101 & !mi(educd)
label define educ_group 0 "unclassified" 1 "HS or less" 2 "Some college" 3 "College grad+"
label values educ_grp educ_group

gen college = .
replace college = 1 if educd>=101 & !mi(educd)
replace college = 0 if educd<101 & educd!=1

// Region (Northeast, Midwest, South, West)
* Region codes not listed below (see label list) leave current_region empty.
gen current_region = ""
replace current_region = "region_midwest" if region==21|region==22
replace current_region = "region_northeast" if region==11|region==12
replace current_region = "region_south" if region==31|region==32|region==33
replace current_region = "region_west" if region==41|region==42

/*
. label list region_lbl
region_lbl:
          11 New England Division
          12 Middle Atlantic Division
          13 Mixed Northeast Divisions (1970 Metro)
          21 East North Central Div.
          22 West North Central Div.
          23 Mixed Midwest Divisions (1970 Metro)
          31 South Atlantic Division
          32 East South Central Div.
          33 West South Central Div.
          34 Mixed Southern Divisions (1970 Metro)
          41 Mountain Division
          42 Pacific Division
          43 Mixed Western Divisions (1970 Metro)
          91 Military/Military reservations
          92 PUMA boundaries cross state lines-1% sample
          97 State not identified
          99 Not identified
*/

// Age groups (<30, 30-39, 40-49, 50-59, 60+); 0 = age missing
gen age_grp = 0
replace age_grp = 1 if age<30
replace age_grp = 2 if age>=30 & age<40
replace age_grp = 3 if age>=40 & age<50
replace age_grp = 4 if age>=50 & age<60
replace age_grp = 5 if age>=60 & !mi(age)

// Sex (female=2, male=1)
gen male=(sex==1)

preserve
	* obtain average income for groups defined by education, region, age, and sex, to be merged with SCE later
	collapse (mean) avg_hh_income = ftotinc  [aw=perwt], by(educ_grp current_region age_grp male)
	save "${ddata}avg_faminc_bygroup_ACS.dta", replace
restore

* Population share of each cell. The previous form,
*   collapse (count) ... [aw=perwt]
* returned the number of physical observations per cell, because collapse's
* count statistic ignores aweights, so the shares were unweighted. Summing
* perwt gives the weighted cell size directly.
collapse (sum) num_cat = perwt, by(income_grp educ_grp current_region age_grp male)
sum num_cat
gen share_ACS = num_cat/r(sum)

save "${ddata}share_ACS.dta", replace


******* Back to Survey Data *******

* ---------------------------------------------------------------------------
* Survey side: attach ACS cell-average income, impute the income bracket where
* hh_income is empty, and compute each cell's share of the survey sample.
* Outputs: ${ddata}temp2.dta (row level), ${ddata}share_survey.dta (cell level)
* ---------------------------------------------------------------------------
use "${ddata}temp.dta", clear

* Numeric census-division code from the division name. This variable is not
* used again in the visible code; the merges below key on current_region and
* educ_grp, which are presumably already present in temp.dta -- TODO confirm.
gen region = .
replace region = 11 if current_censusdiv=="New England"
replace region = 12 if current_censusdiv=="Middle Atlantic"
replace region = 21 if current_censusdiv=="E. N. Central"
replace region = 22 if current_censusdiv=="W. N. Central"
replace region = 31 if current_censusdiv=="South Atlantic"
replace region = 32 if current_censusdiv=="E. S. Central"
replace region = 33 if current_censusdiv=="W. S. Central"
replace region = 41 if current_censusdiv=="Mountain"
replace region = 42 if current_censusdiv=="Pacific"

// Age groups (<30, 30-39, 40-49, 50-59, 60+); 0 = age missing
gen age_grp = 0
replace age_grp = 1 if age<30
replace age_grp = 2 if age>=30 & age<40
replace age_grp = 3 if age>=40 & age<50
replace age_grp = 4 if age>=50 & age<60
* !mi() guard: in Stata a missing value compares greater than any number, so
* without it respondents with missing age would be classified as 60+.
* NOTE(review): such rows now stay in group 0 and will not pass the
* _merge==3 filter below unless the using file also has an age_grp 0 cell.
replace age_grp = 5 if age>=60 & !mi(age)

// Sex (female, male)
// var: male

* Attach the ACS average family income of the respondent's cell; rows without
* a matching cell are dropped.
merge m:1  educ_grp current_region age_grp male using "${ddata}avg_faminc_bygroup_ACS.dta", keepusing(avg_hh_income)
keep if _merge==3
drop _merge

* Reported bracket where available; otherwise the bracket that contains the
* cell's average income.
cap drop imputed_hh_income
gen imputed_hh_income = hh_income if hh_income!=""
replace imputed_hh_income = "<$10k"      if avg_hh_income<10000  & hh_income==""
replace imputed_hh_income = "$10k-20k"   if avg_hh_income<20000  & avg_hh_income>=10000  & hh_income==""
replace imputed_hh_income = "$20k-30k"   if avg_hh_income<30000  & avg_hh_income>=20000  & hh_income==""
replace imputed_hh_income = "$30k-40k"   if avg_hh_income<40000  & avg_hh_income>=30000  & hh_income==""
replace imputed_hh_income = "$40k-50k"   if avg_hh_income<50000  & avg_hh_income>=40000  & hh_income==""
replace imputed_hh_income = "$50k-60k"   if avg_hh_income<60000  & avg_hh_income>=50000  & hh_income==""
replace imputed_hh_income = "$60k-75k"   if avg_hh_income<75000  & avg_hh_income>=60000  & hh_income==""
replace imputed_hh_income = "$75k-100k"  if avg_hh_income<100000 & avg_hh_income>=75000  & hh_income==""
replace imputed_hh_income = "$100k-150k" if avg_hh_income<150000 & avg_hh_income>=100000  & hh_income==""
replace imputed_hh_income = "$150k-200k" if avg_hh_income<200000 & avg_hh_income>=150000  & hh_income==""
replace imputed_hh_income = "$200k+"     if avg_hh_income<.      & avg_hh_income>=200000  & hh_income==""

// IMPUTED Income groups (<30K, 30k-50k, 50k-100k, 100k+)
* Same 1-4 coding as the ACS income_grp so the two sides can be merged on it.
gen income_grp = .
//replace income_grp = 0 if hh_income==""
replace income_grp = 4 if imputed_hh_income=="<$10k"      |imputed_hh_income=="$10k-20k"   |imputed_hh_income=="$20k-30k"
replace income_grp = 3 if imputed_hh_income=="$30k-40k"   |imputed_hh_income=="$40k-50k"
replace income_grp = 2 if imputed_hh_income=="$50k-60k"   |imputed_hh_income=="$60k-75k"   |imputed_hh_income=="$75k-100k"
replace income_grp = 1 if imputed_hh_income=="$100k-150k" |imputed_hh_income=="$150k-200k" |imputed_hh_income=="$200k+"
label define income_group 1 "100k+" 2 "50k-100k" 3 "30k-50k" 4 "<30k"
label values income_grp income_group

save "${ddata}temp2.dta", replace

* Unweighted number of survey rows per cell, expressed as a share of all rows.
*collapse (count) num_cat = all, by(income_grp college current_region age_grp male)
collapse (count) num_cat = all, by(income_grp educ_grp current_region age_grp male)
sum num_cat
gen share_survey = num_cat/r(sum)

save "${ddata}share_survey.dta", replace



**************************************************************
* 3. Construct the weights
**************************************************************
* ---------------------------------------------------------------------------
* Post-stratification weights (ACS cell share / survey cell share), weighted
* control-group means by state-by-sex group, and outcome / control variables.
* Output: ${ddata}cleaned_data_wACSweight.dta
* ---------------------------------------------------------------------------
clear all
use "${ddata}temp2.dta", clear

* Attach the survey share and the ACS share of each respondent's cell.
cap drop _merge
merge m:1 income_grp educ_grp current_region age_grp male using  "${ddata}share_survey.dta"

cap drop _merge
merge m:1 income_grp educ_grp current_region age_grp male using  "${ddata}share_ACS.dta"

keep if _merge==3 // original author's note: all obs from SCE are kept
drop _merge
gen weight_ACS = share_ACS/share_survey


///////

* Highest group id; it does not depend on the question index, so compute it once.
sum group, meanonly
local n_groups = r(max)

forvalues i = 0/5 {
	* truth_i : weight_ACS-weighted mean of qs2_i among control==1 rows of the group
	* n_i     : number of control==1 rows of the group entering a weighted summary
	gen truth_`i' = .
	gen n_`i' = .
	forvalues j = 1/`n_groups' {
		qui sum qs2_`i' if group==`j' & control==1 [aweight= weight_ACS]
		replace truth_`i' = r(mean) if group==`j'
		qui sum all if group==`j' & control==1 [aweight= weight_ACS]
		replace n_`i' = r(N) if group==`j'
	}

	* Indicators are defined only when BOTH operands are non-missing. A
	* missing value compares greater than any number in Stata, so guarding
	* one side only would code "x < ." as 1 and "x > ." as 0.
	gen so_perception_gap_`i' = so_mother_time_off_`i' - truth_`i'
	gen so_gap_positive_`i' = so_mother_time_off_`i' > truth_`i' if !mi(so_mother_time_off_`i', truth_`i')
	gen so_less_fo`i' = so_mother_time_off_`i' < qs2_`i' if !mi(so_mother_time_off_`i', qs2_`i')
	gen original_gap`i' = so_mother_time_off_`i' - info_q`i'
	gen so_original_gap_positive`i' = so_mother_time_off_`i' > info_q`i' if !mi(so_mother_time_off_`i', info_q`i')
}


*** Generate new variables for Tables ***
gen gap=so_mother_time_off_0-info_q0
gen gap_treatment=gap*treatment
gen working_ft=q10_1==1 if q10_1~=.

** Generate new demographic characteristics variables as controls
* "cap" keeps the script running when a variable already exists or an input
* variable is absent; in that case the variable is simply not (re)created.
cap gen child_under_6=residence_children_under6 > 0 if residence_children_under6~=.
cap gen child_under_18=residence_children_under18 > 0 if residence_children_under18~=.

cap gen lincome=log(hh_income_median)

* donation_so is read from qh2 when male==0 and from qh3 when male==1.
gen donation_fo=qh1
gen donation_so=qh2 if male==0
replace donation_so=qh3 if male==1

gen donation_so_l_fo=donation_so>donation_fo if donation_so~=.&donation_fo~=.
////////

save  "${ddata}cleaned_data_wACSweight.dta", replace

